library(ggplot2)
library(GGally)
library(gridExtra)
## Loading required package: grid
library(psych)
## 
## Attaching package: 'psych'
## 
## The following object is masked from 'package:ggplot2':
## 
##     %+%
library(dplyr)
## 
## Attaching package: 'dplyr'
## 
## The following object is masked from 'package:GGally':
## 
##     nasa
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(scales)
## 
## Attaching package: 'scales'
## 
## The following objects are masked from 'package:psych':
## 
##     alpha, rescale
library(memisc)
## Loading required package: lattice
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
## 
## 
## Attaching package: 'memisc'
## 
## The following object is masked from 'package:scales':
## 
##     percent
## 
## The following objects are masked from 'package:dplyr':
## 
##     collect, query, rename
## 
## The following objects are masked from 'package:stats':
## 
##     contr.sum, contr.treatment, contrasts
## 
## The following objects are masked from 'package:base':
## 
##     as.array, trimws
# Use the minimal theme with a base font size of 20 for every plot below.
theme_set(theme_minimal(base_size = 20))

Univariate Plot Section

# Load the red-wine data set. The path is spelled out explicitly instead of
# calling setwd(), so reading the file does not change the session's working
# directory as a side effect.
wdf <- read.csv(file.path("~/repos/nanodgreep3", "wineQualityReds.csv"))
# Drop the unused column 'X' by name rather than by position, so this step
# cannot silently remove a real feature if the file layout ever changes.
wdf <- wdf[, names(wdf) != "X", drop = FALSE]
dim(wdf)
## [1] 1599   12
names(wdf)
##  [1] "fixed.acidity"        "volatile.acidity"     "citric.acid"         
##  [4] "residual.sugar"       "chlorides"            "free.sulfur.dioxide" 
##  [7] "total.sulfur.dioxide" "density"              "pH"                  
## [10] "sulphates"            "alcohol"              "quality"
str(wdf)
## 'data.frame':    1599 obs. of  12 variables:
##  $ fixed.acidity       : num  7.4 7.8 7.8 11.2 7.4 7.4 7.9 7.3 7.8 7.5 ...
##  $ volatile.acidity    : num  0.7 0.88 0.76 0.28 0.7 0.66 0.6 0.65 0.58 0.5 ...
##  $ citric.acid         : num  0 0 0.04 0.56 0 0 0.06 0 0.02 0.36 ...
##  $ residual.sugar      : num  1.9 2.6 2.3 1.9 1.9 1.8 1.6 1.2 2 6.1 ...
##  $ chlorides           : num  0.076 0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.071 ...
##  $ free.sulfur.dioxide : num  11 25 15 17 11 13 15 15 9 17 ...
##  $ total.sulfur.dioxide: num  34 67 54 60 34 40 59 21 18 102 ...
##  $ density             : num  0.998 0.997 0.997 0.998 0.998 ...
##  $ pH                  : num  3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.35 ...
##  $ sulphates           : num  0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.8 ...
##  $ alcohol             : num  9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 10.5 ...
##  $ quality             : int  5 5 5 6 5 5 5 7 7 5 ...
levels(factor(wdf$quality))
## [1] "3" "4" "5" "6" "7" "8"
summary(wdf)
##  fixed.acidity   volatile.acidity  citric.acid    residual.sugar  
##  Min.   : 4.60   Min.   :0.1200   Min.   :0.000   Min.   : 0.900  
##  1st Qu.: 7.10   1st Qu.:0.3900   1st Qu.:0.090   1st Qu.: 1.900  
##  Median : 7.90   Median :0.5200   Median :0.260   Median : 2.200  
##  Mean   : 8.32   Mean   :0.5278   Mean   :0.271   Mean   : 2.539  
##  3rd Qu.: 9.20   3rd Qu.:0.6400   3rd Qu.:0.420   3rd Qu.: 2.600  
##  Max.   :15.90   Max.   :1.5800   Max.   :1.000   Max.   :15.500  
##    chlorides       free.sulfur.dioxide total.sulfur.dioxide
##  Min.   :0.01200   Min.   : 1.00       Min.   :  6.00      
##  1st Qu.:0.07000   1st Qu.: 7.00       1st Qu.: 22.00      
##  Median :0.07900   Median :14.00       Median : 38.00      
##  Mean   :0.08747   Mean   :15.87       Mean   : 46.47      
##  3rd Qu.:0.09000   3rd Qu.:21.00       3rd Qu.: 62.00      
##  Max.   :0.61100   Max.   :72.00       Max.   :289.00      
##     density             pH          sulphates         alcohol     
##  Min.   :0.9901   Min.   :2.740   Min.   :0.3300   Min.   : 8.40  
##  1st Qu.:0.9956   1st Qu.:3.210   1st Qu.:0.5500   1st Qu.: 9.50  
##  Median :0.9968   Median :3.310   Median :0.6200   Median :10.20  
##  Mean   :0.9967   Mean   :3.311   Mean   :0.6581   Mean   :10.42  
##  3rd Qu.:0.9978   3rd Qu.:3.400   3rd Qu.:0.7300   3rd Qu.:11.10  
##  Max.   :1.0037   Max.   :4.010   Max.   :2.0000   Max.   :14.90  
##     quality     
##  Min.   :3.000  
##  1st Qu.:5.000  
##  Median :6.000  
##  Mean   :5.636  
##  3rd Qu.:6.000  
##  Max.   :8.000

The mean quality is 5.6 and the median quality is 6.0. About 75% of wines have a quality score of 6 or lower. About 75% of wines have a fixed acidity below 10.0. About 75% of wines have a residual sugar of at most 2.6, but the maximum value, 15.5, is very high (i.e. very sweet). All wines have similar density, between 0.99 and 1.00.

# Histogram of the quality score using ggplot2's default binning.
ggplot(wdf, aes(x = quality)) +
  geom_histogram()
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

Quality takes only integer values. Most wines score 5 or 6. The scores can be grouped into three grades.

# Add a simplified categorical variable for the wine quality:
#   NG = quality 3 or 4, EX = quality 7 or 8, GD = everything else (5 or 6).
wdf$taste <- factor(
  ifelse(wdf$quality %in% c(3, 4), "NG",
         ifelse(wdf$quality %in% c(7, 8), "EX", "GD")),
  levels = c("NG", "GD", "EX")
)

# Keep the original quality score (3 to 8) as a factor, one level per score.
wdf$taste.detail <- factor(wdf$quality, levels = 3:8)

# Bar chart of the number of wines in each taste grade.
ggplot(wdf, aes(x = taste)) +
  geom_bar()

summary(wdf$taste)
##   NG   GD   EX 
##   63 1319  217
str(wdf$taste)
##  Factor w/ 3 levels "NG","GD","EX": 2 2 2 2 2 2 2 3 3 2 ...
# Share of wines in each taste grade. The bar height is count / total, i.e. a
# proportion between 0 and 1, so the axis ticks are formatted as percentages to
# agree with the 'Percentage' axis label. percent() is called through the
# scales namespace because memisc masks scales' percent() when it is attached.
ggplot(wdf, aes(x = taste)) +
  geom_bar(aes(y = (..count..) / sum(..count..))) +
  scale_y_continuous(labels = scales::percent) +
  ylab('Percentage')

summary(wdf$fixed.acidity)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    4.60    7.10    7.90    8.32    9.20   15.90
# Histogram of fixed acidity with 0.25-wide bins.
ggplot(wdf, aes(x = fixed.acidity)) +
  geom_histogram(binwidth = 0.25)

Most wines have a fixed.acidity between 7 and 14.

summary(wdf$volatile.acidity)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.1200  0.3900  0.5200  0.5278  0.6400  1.5800
# Histogram of volatile acidity with the default binning.
ggplot(wdf, aes(x = volatile.acidity)) +
  geom_histogram()
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

# Same variable with 0.01-wide bins, viewing only the 0.1 to 1.0 range.
ggplot(wdf, aes(x = volatile.acidity)) +
  geom_histogram(binwidth = 0.01) +
  coord_cartesian(xlim = c(0.1, 1.0))

Again, most wines have a volatile.acidity between 0.2 and 1.0.

summary(wdf$citric.acid)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.090   0.260   0.271   0.420   1.000
# Histogram of citric acid with the default binning.
ggplot(wdf, aes(x = citric.acid)) +
  geom_histogram()
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

# Same variable with 0.01-wide bins.
ggplot(wdf, aes(x = citric.acid)) +
  geom_histogram(binwidth = 0.01)

table(wdf$citric.acid == 0)
## 
## FALSE  TRUE 
##  1467   132

About 8% of wines (132 of 1599) have no citric.acid.

summary(wdf$residual.sugar)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.900   1.900   2.200   2.539   2.600  15.500
# Histogram of residual sugar with the default binning.
ggplot(wdf, aes(x = residual.sugar)) +
  geom_histogram()
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

# 0.1-wide bins, viewing only the 0.9 to 12.0 range.
ggplot(wdf, aes(x = residual.sugar)) +
  geom_histogram(binwidth = 0.1) +
  coord_cartesian(xlim = c(0.9, 12.0))

# Same bins and x range, with the count axis on a log10 scale.
ggplot(wdf, aes(x = residual.sugar)) +
  geom_histogram(binwidth = 0.1) +
  coord_cartesian(xlim = c(0.9, 12.0), ylim = c(1, 150)) +
  scale_y_log10()
## Warning: Stacking not well defined when ymin != 0

Residual sugar of most wines varies from 1.0 to 7.0.

summary(wdf$chlorides)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.01200 0.07000 0.07900 0.08747 0.09000 0.61100
# Histogram of chlorides with the default binning.
ggplot(wdf, aes(x = chlorides)) +
  geom_histogram()
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

# 0.001-wide bins, viewing only the 0.05 to 0.2 range.
ggplot(wdf, aes(x = chlorides)) +
  geom_histogram(binwidth = 0.001) +
  coord_cartesian(xlim = c(0.05, 0.2))

# Same bins and x range, with the count axis on a log10 scale.
ggplot(wdf, aes(x = chlorides)) +
  geom_histogram(binwidth = 0.001) +
  coord_cartesian(xlim = c(0.05, 0.2), ylim = c(1, 100)) +
  scale_y_log10()
## Warning: Stacking not well defined when ymin != 0

Most wines have chlorides between 0.05 and 0.2.

summary(wdf$free.sulfur.dioxide)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    7.00   14.00   15.87   21.00   72.00
# Histogram of free sulfur dioxide with the default binning.
ggplot(wdf, aes(x = free.sulfur.dioxide)) +
  geom_histogram()
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

# Unit-wide bins, viewing only the 0 to 60 range.
ggplot(wdf, aes(x = free.sulfur.dioxide)) +
  geom_histogram(binwidth = 1) +
  coord_cartesian(xlim = c(0, 60))

Most wines have free.sulfur.dioxide under 40.

summary(wdf$total.sulfur.dioxide)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    6.00   22.00   38.00   46.47   62.00  289.00
# Histogram of total sulfur dioxide with the default binning.
ggplot(wdf, aes(x = total.sulfur.dioxide)) +
  geom_histogram()
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

# Unit-wide bins, viewing only the 5 to 160 range.
ggplot(wdf, aes(x = total.sulfur.dioxide)) +
  geom_histogram(binwidth = 1) +
  coord_cartesian(xlim = c(5, 160))

Again, most wines have a total.sulfur.dioxide between 5 and 160.

summary(wdf$density)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.9901  0.9956  0.9968  0.9967  0.9978  1.0040
# Histogram of density with the default binning.
ggplot(wdf, aes(x = density)) +
  geom_histogram()
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## Warning: position_stack requires constant width: output may be incorrect

# Same variable with 0.0001-wide bins.
ggplot(wdf, aes(x = density)) +
  geom_histogram(binwidth = 0.0001)
## Warning: position_stack requires constant width: output may be incorrect

var(wdf$density)
## [1] 3.562029e-06

Most wines have almost the same density: its variance is only about 3.56e-06.

summary(wdf$pH)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.740   3.210   3.310   3.311   3.400   4.010
# Histogram of pH with the default binning.
ggplot(wdf, aes(x = pH)) +
  geom_histogram()
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

# Same variable with 0.01-wide bins.
ggplot(wdf, aes(x = pH)) +
  geom_histogram(binwidth = 0.01)

Of course, all wines are acidic (i.e. below pH 7): every wine has a pH between 2.74 and 4.01.

summary(wdf$sulphates)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.3300  0.5500  0.6200  0.6581  0.7300  2.0000
# Histogram of sulphates with the default binning.
ggplot(wdf, aes(x = sulphates)) +
  geom_histogram()
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

# 0.01-wide bins, viewing only the 0.3 to 1.5 range.
ggplot(wdf, aes(x = sulphates)) +
  geom_histogram(binwidth = 0.01) +
  coord_cartesian(xlim = c(0.3, 1.5))

Likewise, most wines have sulphates between 0.3 and 1.2.

What is the structure of your dataset?

  1. 1599 wines in the data set with 14 features (I added two more features, ‘taste’ and ‘taste.detail’)
    • fixed.acidity
    • volatile.acidity
    • citric.acid
    • residual.sugar
    • chlorides
    • free.sulfur.dioxide
    • total.sulfur.dioxide
    • density
    • pH
    • sulphates
    • alcohol
    • quality
    • taste
    • taste.detail
  2. Every feature is numeric except taste and taste.detail. taste is a factor with 3 levels in the order “NG”, “GD”, “EX”, and taste.detail is a factor with 6 levels (3 to 8).

What is/are the main feature(s) of interest in your dataset?

The main features in this data are alcohol, residual.sugar, and quality (taste). I want to find out which of alcohol and residual.sugar determines a wine’s flavor quality.

What other features in the dataset do you think will help support your investigation into your feature(s) of interest?

Some people enjoy the flavor of citric.acid, or a particular density. Therefore, these two features may also be factors in good taste.

Did you create any new variables from existing variables in the dataset?

I created a ‘taste’ variable from the quality variable. Because quality takes only integer values, I think converting it to a categorical variable is a good idea. In addition, I simplified the 6 scores (3~8) into 3 grades (NG, GD, EX).

Of the features you investigated, were there any unusual distributions? Did you perform any operations on the data to tidy, adjust, or change the form of the data? If so, why did you do this?

The quality variable is not tidy: it takes only integer values, and I see no need to treat it as an integer. So I decided to convert it to a factor with 3 levels.

Bivariate Plots Section

##                citric.acid residual.sugar    density     alcohol
## citric.acid      1.0000000     0.14357716  0.3649472  0.10990325
## residual.sugar   0.1435772     1.00000000  0.3552834  0.04207544
## density          0.3649472     0.35528337  1.0000000 -0.49617977
## alcohol          0.1099032     0.04207544 -0.4961798  1.00000000
## quality          0.2263725     0.01373164 -0.1749192  0.47616632
##                    quality
## citric.acid     0.22637251
## residual.sugar  0.01373164
## density        -0.17491923
## alcohol         0.47616632
## quality         1.00000000
# Scatter-plot matrix for a hand-picked subset of features only; plotting every
# pair of columns would be unnecessarily expensive to compute.
# The columns are selected by name (previously positions 3, 4, 8, 11, 12) so
# the selection does not depend on the column order of wdf.
pairs.panels(
  wdf[, c("citric.acid", "residual.sugar", "density", "alcohol", "quality")],
  pch = "."
)

The alcohol feature has the strongest correlation with quality. The correlation is far from perfect, but it is fairly high. Interestingly, residual.sugar is not correlated with quality. That means a sweeter wine does not imply a better-flavored wine.

# Raw scatter plot of quality against residual sugar, using every wine.
qplot(x = residual.sugar, y = quality, data = wdf)

# Same data with jitter and transparency to reduce overplotting.
ggplot(aes(x = residual.sugar, y = quality), data = wdf) +
  geom_jitter(alpha = 0.33)

# Remove outliers by keeping only wines at or below the 99th percentile of
# residual.sugar (i.e. dropping the top 1%), then overlay a linear fit with its
# confidence band. TRUE is spelled out because T is an ordinary, reassignable
# variable.
ggplot(aes(x = residual.sugar, y = quality),
       data = subset(wdf, residual.sugar > 0 &
                          residual.sugar <= quantile(wdf$residual.sugar, 0.99))) +
  geom_jitter(alpha = 0.33) +
  geom_smooth(method = 'lm', se = TRUE, color = 'blue')

Residual.sugar shows no trend with the quality feature. It turns out that sugar and quality have little relationship.

# Simple linear regression of quality on residual.sugar.
# Outliers are removed first: only wines with residual.sugar at or below its
# 99th percentile (i.e. all but the top 1%) are used for the fit.
sugarAndQuality <- lm(quality ~ residual.sugar,
                    data=subset(wdf, residual.sugar > 0 & 
                                     residual.sugar <= quantile(wdf$residual.sugar, 0.99)))
summary(sugarAndQuality)
## 
## Call:
## lm(formula = quality ~ residual.sugar, data = subset(wdf, residual.sugar > 
##     0 & residual.sugar <= quantile(wdf$residual.sugar, 0.99)))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.6743 -0.6319  0.3560  0.3717  2.3778 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     5.60528    0.05277 106.220   <2e-16 ***
## residual.sugar  0.01211    0.01993   0.608    0.544    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.809 on 1581 degrees of freedom
## Multiple R-squared:  0.0002335,  Adjusted R-squared:  -0.0003989 
## F-statistic: 0.3692 on 1 and 1581 DF,  p-value: 0.5435

The R^2 value shows that sugar and quality have almost no linear relationship.

# Raw scatter plot of quality against alcohol, using every wine.
qplot(x = alcohol, y = quality, data = wdf)

# Remove outliers by keeping only wines at or below the 99th percentile of
# alcohol (i.e. dropping the top 1%), then overlay a linear fit with its
# confidence band. TRUE is spelled out because T is an ordinary, reassignable
# variable.
ggplot(aes(x = alcohol, y = quality),
       data = subset(wdf, alcohol > 0 &
                          alcohol <= quantile(wdf$alcohol, 0.99))) +
  geom_jitter(alpha = 0.33) +
  geom_smooth(method = 'lm', se = TRUE, color = 'blue')

This plot shows a trend: the more alcohol, the higher the quality score.

# Simple linear regression of quality on alcohol.
# Outliers are removed first: only wines with alcohol at or below its 99th
# percentile (i.e. all but the top 1%) are used for the fit.
alcholAndQuality <- lm(quality ~ alcohol,
                    data=subset(wdf, alcohol > 0 & 
                                     alcohol <= quantile(wdf$alcohol, 0.99)))
summary(alcholAndQuality)
## 
## Call:
## lm(formula = quality ~ alcohol, data = subset(wdf, alcohol > 
##     0 & alcohol <= quantile(wdf$alcohol, 0.99)))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.8535 -0.4077 -0.1848  0.5180  2.5923 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.76698    0.18234   9.691   <2e-16 ***
## alcohol      0.37150    0.01746  21.275   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.708 on 1583 degrees of freedom
## Multiple R-squared:  0.2224, Adjusted R-squared:  0.2219 
## F-statistic: 452.6 on 1 and 1583 DF,  p-value: < 2.2e-16

The R^2 value is 0.22, which means alcohol explains about 22% of the variance in wine quality.

# Scatter-plot matrix of the acidity-, chloride- and sulfur-related features
# against quality. The columns are selected by name (previously positions
# 1, 3, 5, 6, 7, 9, 10, 12) so the selection does not depend on column order.
pairs.panels(
  wdf[, c("fixed.acidity", "citric.acid", "chlorides", "free.sulfur.dioxide",
          "total.sulfur.dioxide", "pH", "sulphates", "quality")],
  pch = "."
)

Chemically, acid has low pH values. In this matrix plot, we can verify the fact. We can observe that fixed.acidity and citric.acid have negative correlation with pH.

# Quality against citric acid for every wine, with a linear fit and its
# confidence band. TRUE is spelled out because T is an ordinary, reassignable
# variable.
ggplot(aes(x = citric.acid, y = quality), data = wdf) +
  geom_jitter(alpha = 0.5) +
  geom_smooth(method = 'lm', se = TRUE, color = 'blue')

# Quality against sulphates. Outliers are removed by keeping only wines at or
# below the 99th percentile of sulphates (i.e. dropping the top 1%).
ggplot(aes(x = sulphates, y = quality),
       data = subset(wdf, sulphates > 0 &
                          sulphates <= quantile(wdf$sulphates, 0.99))) +
  geom_jitter(alpha = 0.5) +
  geom_smooth(method = 'lm', se = TRUE, color = 'blue')

In addition, sulphates and citric.acid also have relatively high correlations with quality.

# Simple linear regression of quality on citric.acid.
# Outliers are removed first: wines with citric.acid above its 99th percentile
# (the top 1%) are excluded from the fit.
# NOTE(review): the `citric.acid > 0` condition also excludes every wine whose
# citric.acid is exactly 0, which removes more rows than the intended top-1%
# trim -- confirm that dropping those wines is intended.
citricAndQuality <- lm(quality ~ citric.acid,
                    data=subset(wdf, citric.acid > 0 & 
                                     citric.acid <= quantile(wdf$citric.acid, 0.99)))
summary(citricAndQuality)
## 
## Call:
## lm(formula = quality ~ citric.acid, data = subset(wdf, citric.acid > 
##     0 & citric.acid <= quantile(wdf$citric.acid, 0.99)))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.9979 -0.6018  0.1152  0.4642  2.5962 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.37549    0.03904 137.697  < 2e-16 ***
## citric.acid  0.94312    0.11449   8.238 3.88e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7807 on 1449 degrees of freedom
## Multiple R-squared:  0.04474,    Adjusted R-squared:  0.04408 
## F-statistic: 67.86 on 1 and 1449 DF,  p-value: 3.877e-16
# Simple linear regression of quality on sulphates.
# Outliers are removed first: only wines with sulphates at or below its 99th
# percentile (i.e. all but the top 1%) are used for the fit.
sulphatesAndQuality <- lm(quality ~ sulphates,
                    data=subset(wdf, sulphates > 0 & 
                                     sulphates <= quantile(wdf$sulphates, 0.99)))
summary(sulphatesAndQuality)
## 
## Call:
## lm(formula = quality ~ sulphates, data = subset(wdf, sulphates > 
##     0 & sulphates <= quantile(wdf$sulphates, 0.99)))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -3.02595 -0.51097 -0.02595  0.47064  2.39707 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.44423    0.09018   49.28   <2e-16 ***
## sulphates    1.83920    0.13573   13.55   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7653 on 1581 degrees of freedom
## Multiple R-squared:  0.1041, Adjusted R-squared:  0.1035 
## F-statistic: 183.6 on 1 and 1581 DF,  p-value: < 2.2e-16

However, their R^2 values show that they explain little of the variation in quality.

Bivariate Analysis

Talk about some of the relationships you observed in this part of the investigation. How did the feature(s) of interest vary with other features in the dataset?

In the alcohol case, the higher its value, the better the wine quality. The residual.sugar case, however, was surprising. At first, I thought a sweet flavor would be a main factor in high-quality wine, but it turned out that they are essentially unrelated.

Did you observe any interesting relationships between the other features (not the main feature(s) of interest)?

pH is correlated with fixed.acidity and citric.acid, which makes sense chemically. Because they are correlated with each other, they may carry redundant information.

What was the strongest relationship you found?

The strongest relationship is with alcohol. According to the statistical analysis, it appears to affect the quality value. The others, including sugar, citric acid, and sulphates, are too weak to say much about their relation with quality.

Multivariate Plots Section

I converted quality to a categorical feature, taste (3 levels):

- NG : quality 3,4
- GD : quality 5,6
- EX : quality 7,8

And, I just converted quality to factor variable, taste.detail.

The following box plots show these relationships:

# Box plots of four features by the 3-level taste grade.
ggplot(wdf, aes(x = taste, y = alcohol)) +
  geom_boxplot()

ggplot(wdf, aes(x = taste, y = residual.sugar)) +
  geom_boxplot()

ggplot(wdf, aes(x = taste, y = citric.acid)) +
  geom_boxplot()

ggplot(wdf, aes(x = taste, y = sulphates)) +
  geom_boxplot()

# The same four features by the detailed quality score (3 to 8).
ggplot(wdf, aes(x = taste.detail, y = alcohol)) +
  geom_boxplot()

ggplot(wdf, aes(x = taste.detail, y = residual.sugar)) +
  geom_boxplot()

ggplot(wdf, aes(x = taste.detail, y = citric.acid)) +
  geom_boxplot()

ggplot(wdf, aes(x = taste.detail, y = sulphates)) +
  geom_boxplot()

This time, I used box plots to convey the same trends as before.

# Citric acid against alcohol for every wine, coloured by taste grade.
ggplot(wdf, aes(x = alcohol, y = citric.acid, color = taste)) +
  geom_point()

Nothing stands out in the plot above. I want to compare only NG and EX, leaving out GD, because there are too many GD cases.

# Same scatter plot restricted to the NG and EX wines (GD rows are left out).
ggplot(subset(wdf, taste != "GD"),
       aes(x = alcohol, y = citric.acid, color = taste)) +
  geom_point()

The points can be divided diagonally. This suggests again that alcohol and citric.acid have a positive relationship with wine quality.

# Nested linear models for quality, fitted on all wines: alcohol only (m1),
# then adding sulphates (m2), then adding citric.acid (m3). mtable() prints the
# three fits side by side so coefficients and fit statistics can be compared.
# NOTE(review): I() around the bare variables looks unnecessary here -- it
# appears only to change the term label (e.g. "I(alcohol)"); confirm.
m1 <- lm(I(quality) ~ I(alcohol), data = wdf)
m2 <- update(m1, ~ . + sulphates)
m3 <- update(m2, ~ . + citric.acid)
mtable(m1, m2, m3)
## 
## Calls:
## m1: lm(formula = I(quality) ~ I(alcohol), data = wdf)
## m2: lm(formula = I(quality) ~ I(alcohol) + sulphates, data = wdf)
## m3: lm(formula = I(quality) ~ I(alcohol) + sulphates + citric.acid, 
##     data = wdf)
## 
## =============================================
##                    m1        m2        m3    
## ---------------------------------------------
## (Intercept)      1.875***  1.375***  1.434***
##                 (0.175)   (0.177)   (0.176)  
## I(alcohol)       0.361***  0.346***  0.338***
##                 (0.017)   (0.016)   (0.016)  
## sulphates                  0.994***  0.814***
##                           (0.102)   (0.107)  
## citric.acid                          0.513***
##                                     (0.093)  
## ---------------------------------------------
## R-squared           0.227     0.270     0.284
## adj. R-squared      0.226     0.269     0.282
## sigma               0.710     0.690     0.684
## F                 468.267   294.988   210.501
## p                   0.000     0.000     0.000
## Log-likelihood  -1721.057 -1675.142 -1659.955
## Deviance          805.870   760.894   746.576
## AIC              3448.114  3358.284  3329.910
## BIC              3464.245  3379.793  3356.795
## N                1599      1599      1599    
## =============================================

As we added the main features to the linear model, the R^2 value increased.

Multivariate Analysis

Talk about some of the relationships you observed in this part of the investigation. Were there features that strengthened each other in terms of looking at your feature(s) of interest?

The relationship between Red wine quality and alcohol, citric.acid, can be shown again by using categorical variables. It is more intuitive than just numerical values.

As I investigated, alcohol and citric acid are the main factors in red wine quality.

Were there any interesting or surprising interactions between features?

Visualizing only the two extreme grades (NG and EX) was very successful. Particularly when there are too many samples, comparing the two extremes is a good idea.


Final Plots and Summary

Plot One

Description One

When someone tries a wine, he or she instinctively sorts it into one of 3 types (“not good”, “good”, or “excellent”). Therefore, I added the new variable ‘taste’ for intuitive analysis and visual simplicity.

Plot Two

Description Two

I reasoned that a sweet wine would tend to get an excellent grade, because people love sweet drinks. However, that turned out to be wrong. Sugar is not a crucial factor in determining a red wine’s quality. Every grade has almost the same amount of residual sugar.

Plot Three

Description Three

Alcohol is the main reason people drink red wine or other liquors, so I suspected it is a main feature of high-quality wine. In addition, potassium sulphate is a distinctive ingredient in making red wine, so it is one of the reasons people around the world enjoy red wine. As I expected, “EX”-graded red wine tends to have more alcohol than the others. Sulphates also show a positive trend as the 3-level taste grade goes higher.


Reflection

The red wine data was very tidy, so it was very convenient to handle. Every feature is numeric, so I needed to convert some numerical variables to categorical ones, e.g., taste and taste.detail.

Based on linear modeling, alcohol and sulphates turned out to be the main factors in determining a high grade of wine. However, their R-squared values are not large.

For future work, we could apply other, nonlinear statistical models. That may give us a new perspective on this red wine data.